This file generates plots of embeddings of pancreas and iPSC dataset with standard PCA+UMAP preprocessing pipeline from scanpy (which metadata covariates explain most of the variation found in these datasets? Does the embedding capture the pseudotime ordering estimated with RNA velocity analysis?)
import numpy as np
import pandas as pd
import scanpy as sc
import scvelo as scv
import matplotlib.pyplot as plt
#pip install leidenalg
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')
scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.20.3 scipy==1.7.1 pandas==1.3.3 scikit-learn==1.0 statsmodels==0.13.0 python-igraph==0.9.7 louvain==0.7.0 pynndescent==0.5.4
pwd
'/home/jovyan'
results_file = 'write/pancreas.h5ad' # the file that will store the analysis results
Read in the count matrix into an AnnData object
pancreasData=scv.datasets.pancreas()
pancreasData
AnnData object with n_obs × n_vars = 3696 × 27998
obs: 'clusters_coarse', 'clusters', 'S_score', 'G2M_score'
var: 'highly_variable_genes'
uns: 'clusters_coarse_colors', 'clusters_colors', 'day_colors', 'neighbors', 'pca'
obsm: 'X_pca', 'X_umap'
layers: 'spliced', 'unspliced'
obsp: 'distances', 'connectivities'
sc.pl.highest_expr_genes(pancreasData, n_top=20, )
normalizing counts per cell
finished (0:00:00)
sc.pp.filter_cells(pancreasData, min_genes=200)
sc.pp.filter_genes(pancreasData, min_cells=3)
filtered out 12261 genes that are detected in less than 3 cells
pancreasData
AnnData object with n_obs × n_vars = 3696 × 15737
obs: 'clusters_coarse', 'clusters', 'S_score', 'G2M_score', 'n_genes'
var: 'highly_variable_genes', 'n_cells'
uns: 'clusters_coarse_colors', 'clusters_colors', 'day_colors', 'neighbors', 'pca'
obsm: 'X_pca', 'X_umap'
layers: 'spliced', 'unspliced'
obsp: 'distances', 'connectivities'
pancreasData.var['mt'] = pancreasData.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(pancreasData, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
pancreasData.var['mt'] = pancreasData.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(pancreasData, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sc.pl.violin(pancreasData, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
jitter=0.4, multi_panel=True)
sc.pl.violin(pancreasData, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
jitter=0.4, multi_panel=True)
sc.pl.violin(pancreasData, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
jitter=0.4, multi_panel=True)
sc.pl.scatter(pancreasData, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(pancreasData, x='total_counts', y='n_genes_by_counts')
pancreasData = pancreasData[pancreasData.obs.n_genes_by_counts < 2500, :]
pancreasData = pancreasData[pancreasData.obs.pct_counts_mt < 5, :]
sc.pp.normalize_total(pancreasData, target_sum=1e4)
normalizing counts per cell
finished (0:00:00)
sc.pp.log1p(pancreasData)
sc.pp.highly_variable_genes(pancreasData, min_mean=0.0125, max_mean=3, min_disp=0.5)
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
sc.pl.highly_variable_genes(pancreasData)
Run PCA
sc.tl.pca(pancreasData, svd_solver='arpack')
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:00)
pancreasData.var['highly_variable']
index
Xkr4 False
Mrpl15 False
4732440D04Rik False
Gm26901 False
Sntg1 False
...
Ddx3y True
Kdm5d False
Eif2s3y True
Gm29650 False
Erdr1 False
Name: highly_variable, Length: 15737, dtype: bool
sc.pl.pca(pancreasData, color="Ddx3y")
sc.pl.pca(pancreasData, color="Xkr4") #not a highly variable gene
sc.pl.pca_variance_ratio(pancreasData, log=True)
sc.pp.neighbors(pancreasData, n_neighbors=10, n_pcs=40)
computing neighbors
using 'X_pca' with n_pcs = 40
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
sc.tl.leiden(pancreasData)
#sc.pl.paga(pancreasData, plot=False) # remove `plot=False` if you want to see the coarse-grained graph
sc.tl.umap(pancreasData)
running Leiden clustering
finished: found 14 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:03)
sc.pl.umap(pancreasData, color=['Ddx3y'])
pancreasData.var['highly_variable_genes']
index
Xkr4 False
Mrpl15 False
4732440D04Rik False
Gm26901 False
Sntg1 True
...
Ddx3y True
Kdm5d False
Eif2s3y True
Gm29650 False
Erdr1 True
Name: highly_variable_genes, Length: 15737, dtype: category
Categories (2, object): ['False', 'True']
Bonemarrow Dataset
bonemarrow=scv.datasets.bonemarrow()
sc.pl.highest_expr_genes(bonemarrow, n_top=20, )
normalizing counts per cell
finished (0:00:00)
sc.pp.normalize_total(bonemarrow, target_sum=1e4)
normalizing counts per cell
finished (0:00:00)
sc.pp.filter_cells(bonemarrow, min_genes=200)
sc.pp.filter_genes(bonemarrow, min_cells=3)
filtered out 1782 genes that are detected in less than 3 cells
bonemarrow.var['mt'] = bonemarrow.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(bonemarrow, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sc.pl.violin(bonemarrow, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
jitter=0.4, multi_panel=True)
sc.pl.scatter(bonemarrow, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(bonemarrow, x='total_counts', y='n_genes_by_counts')
bonemarrow= bonemarrow[bonemarrow.obs.n_genes_by_counts < 2500, :]
bonemarrow = bonemarrow[bonemarrow.obs.pct_counts_mt < 5, :]
sc.pp.log1p(bonemarrow)
sc.pp.highly_variable_genes(bonemarrow, min_mean=0.0125, max_mean=3, min_disp=0.5)
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
sc.pl.highly_variable_genes(bonemarrow)
sc.tl.pca(bonemarrow, svd_solver='arpack')
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:00)
bonemarrow.var['highly_variable']
index
FO538757.1 True
FAM41C False
RP11-54O7.3 False
NOC2L False
HES4 False
...
USP9Y False
DDX3Y False
TMSB4Y False
EIF1AY False
RPS4Y2 True
Name: highly_variable, Length: 12537, dtype: bool
x1=sc.pl.pca(bonemarrow, color="FO538757.1", sort_order=True)
x2=sc.pl.pca(bonemarrow, color="FO538757.1", sort_order=False)
y1=sc.pl.pca(bonemarrow, color="USP9Y", sort_order=False)
y2=sc.pl.pca(bonemarrow, color="USP9Y", sort_order=True)
bonemarrow
AnnData object with n_obs × n_vars = 320 × 12537
obs: 'clusters', 'palantir_pseudotime', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
uns: 'clusters_colors', 'log1p', 'hvg', 'pca'
obsm: 'X_tsne', 'X_pca'
varm: 'PCs'
layers: 'spliced', 'unspliced'
sc.pl.pca_variance_ratio(bonemarrow, log=True)
sc.pp.neighbors(bonemarrow, n_neighbors=10, n_pcs=40)
computing neighbors
using 'X_pca' with n_pcs = 40
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
sc.tl.leiden(bonemarrow)
#sc.pl.paga(pancreasData, plot=False) # remove `plot=False` if you want to see the coarse-grained graph
sc.tl.umap(bonemarrow)
running Leiden clustering
finished: found 10 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:01)
sc.pl.umap(bonemarrow, color=['FO538757.1']) #highly variable gene